Define a gene co-expression similarity comparing biweight midcorrelation vs correlation vs euclidean distance
Performing hierarchical clustering using average linkage hclust(method=‘average’)
Extracting clusters using a dynamic brach cutting approach from Dynamic Tree Cut: in-depth description, tests and applications using Dynamic Tree
Merging similar clusters using Module Eigengenes comparing original clusters vs merged clusters vs two main clusters
Load preprocessed dataset (preprocessing code in /../FirstYearReview/data_preprocessing.Rmd)
Keep DE genes
datExpr = datExpr %>% filter(rownames(.) %in% rownames(DE_info)[DE_info$padj<0.05])
rownames(datExpr) = datGenes$feature_id[DE_info$padj<0.05 & !is.na(DE_info$padj)]
datGenes = datGenes %>% filter(feature_id %in% rownames(DE_info)[DE_info$padj<0.05])
DE_info = DE_info %>% filter(padj<0.05)
print(paste0('Keeping ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "Keeping 3000 genes and 80 samples"
allowWGCNAThreads()
## Allowing multi-threading with up to 64 threads.
cor_mat = datExpr %>% t %>% bicor
Correcting the correlation matrix from \(s \in [-1,1]\) to \(s \in [0,1]\) using \(s_{ij}=|bw(i,j)|\)
S = abs(cor_mat)
Using 1-S sa distance matrix
diss_S = 1-S
dend = diss_S %>% as.dist %>% hclust(method='average')
plot(dend, hang=0, labels=FALSE)
Using Dynamic Tree Cut, a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications
modules = cutreeDynamicTree(dend, deepSplit=F, minModuleSize=20)
names(modules) = labels(dend)
clusterings[['bicor']] = modules
Calculate the “eigengenes” (1st principal component) of each module and merging similar modules
module_colors = c('gray', viridis(max(modules)))
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])+1])
MEs = MEs_output$eigengenes
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )
# Merge similar modules
bicor_dist = 1-bicor(MEs)
dend_MEs = bicor_dist %>% as.dist %>% hclust(method='average')
dend_MEs %>% as.dendrogram %>% set('labels', rep('', nrow(bicor_dist))) %>% plot(ylim=c(0, 0.6))
abline(h=0.5, col='#0099cc')
abline(h=0.2, col='#009999')
# Two main modules
tree_cut = cutree(dend_MEs, h=0.5)
top_modules = modules %>% replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==1])) %>% as.numeric), 1) %>%
replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==2])) %>% as.numeric), 2)
clusterings[['bicor_top_clusters']] = top_modules
# Closest modules
tree_cut = cutree(dend_MEs, h=0.2)
merged_modules = modules
n=0
for(i in sort(unique(tree_cut))){
n=n+1
merged_modules = merged_modules %>%
replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==i])) %>% as.numeric), n)
}
clusterings[['bicor_merged']] = merged_modules
top_module_colors = c('gray', viridis(max(top_modules)))
merged_module_colors = c('gray', viridis(max(merged_modules)))
dend_colors = data.frame('ID'=names(modules[labels(dend)]),
'OriginalModules' = module_colors[modules[dend$order]+1],
'MergedModules' = merged_module_colors[merged_modules[dend$order]+1],
'TopModules' = top_module_colors[top_modules[dend$order]+1])
dend %>% as.dendrogram(hang=0) %>% set('labels', rep('', nrow(diss_S))) %>% plot(ylim=c(min(dend$height),1))
colored_bars(colors=dend_colors[,-1])
rm(MEs, modules, module_colors, MEs_output, top_modules, merged_modules, tree_cut, top_module_colors, dend_colors,
i, diss_S)
cor_mat = datExpr %>% t %>% cor
Correcting the correlation matrix from \(s \in [-1,1]\) to \(s \in [0,1]\) using \(s_{ij}=|bw(i,j)|\)
S = abs(cor_mat)
diss_S = 1-S
dend = diss_S %>% as.dist %>% hclust(method='average')
plot(dend, hang=0, labels=FALSE)
Using Dynamic Tree Cut, a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications
modules = cutreeDynamicTree(dend, deepSplit=F, minModuleSize=20)
names(modules) = labels(dend)
clusterings[['cor']] = modules
Merging modules with similar expression profiles using the “eigengenes” (1st principal component) of each module and merging similar modules
module_colors = c('gray', viridis(max(modules)))
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])+1])
MEs = MEs_output$eigengenes
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )
# Merge similar modules
bicor_dist = 1-bicor(MEs)
dend_MEs = bicor_dist %>% as.dist %>% hclust(method='average')
dend_MEs %>% as.dendrogram %>% set('labels', rep('', nrow(bicor_dist))) %>% plot(ylim=c(0, 0.55))
abline(h=0.5, col='#0099cc')
abline(h=0.2, col='#009999')
# Two main modules
tree_cut = cutree(dend_MEs, h=0.5)
top_modules = modules %>% replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==1])) %>% as.numeric), 1) %>%
replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==2])) %>% as.numeric), 2)
clusterings[['cor_top_clusters']] = top_modules
# Closest modules
tree_cut = cutree(dend_MEs, h=0.2)
merged_modules = modules
n=0
for(i in sort(unique(tree_cut))){
n=n+1
merged_modules = merged_modules %>%
replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==i])) %>% as.numeric), n)
}
clusterings[['cor_merged']] = merged_modules
top_module_colors = c('gray', viridis(max(top_modules)))
merged_module_colors = c('gray', viridis(length(unique(merged_modules))))
dend_colors = data.frame('ID'=names(modules[labels(dend)]),
'OriginalModules' = module_colors[modules[dend$order]+1],
'MergedModules' = merged_module_colors[merged_modules[dend$order]+1],
'TopModules' = top_module_colors[top_modules[dend$order]+1])
dend %>% as.dendrogram(hang=0) %>% set('labels', rep('', nrow(diss_S))) %>% plot(ylim=c(min(dend$height),1))
colored_bars(colors=dend_colors[,-1])
rm(MEs, modules, module_colors, MEs_output, top_modules, merged_modules, tree_cut, top_module_colors, dend_colors,
i, diss_S)
cor_mat = datExpr %>% dist
Correcting the correlation matrix to \(s \in [0,1]\) using \(s_{ij}=\frac{dist(i,j)-min(dist)}{max(dist)-min(dist)}\)
S = (cor_mat-min(cor_mat))/(max(cor_mat)-min(cor_mat))
dend = S %>% hclust(method='average')
plot(dend, hang=-1, labels=FALSE)
Using Dynamic Tree Cut, a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications
modules = cutreeDynamicTree(dend, deepSplit=F, minModuleSize=20)
names(modules) = labels(dend)
clusterings[['euc']] = modules
Merging modules with similar expression profiles using the “eigengenes” (1st principal component) of each module and merging similar modules
module_colors = c('gray', viridis(max(modules)))
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])+1])
MEs = MEs_output$eigengenes
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )
# Merge similar modules
bicor_dist = 1-bicor(MEs)
dend_MEs = bicor_dist %>% as.dist %>% hclust(method='average')
dend_MEs %>% as.dendrogram %>% set('labels', rep('', nrow(bicor_dist))) %>% plot(ylim=c(0, 0.18))
abline(h=0.15, col='#0099cc')
abline(h=0.04, col='#009999')
# Two main modules
tree_cut = cutree(dend_MEs, h=0.15)
top_modules = modules %>% replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==1])) %>% as.numeric), 1) %>%
replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==2])) %>% as.numeric), 2)
clusterings[['euc_top_clusters']] = top_modules
# Closest modules
tree_cut = cutree(dend_MEs, h=0.04)
merged_modules = modules
n=0
for(i in sort(unique(tree_cut))){
n=n+1
merged_modules = merged_modules %>%
replace(modules %in% (gsub('Mod','', names(tree_cut[tree_cut==i])) %>% as.numeric), n)
}
clusterings[['euc_merged']] = merged_modules
top_module_colors = c('gray', viridis(max(top_modules)))
merged_module_colors = c('gray', viridis(length(unique(merged_modules))))
dend_colors = data.frame('ID'=names(modules[labels(dend)]),
'OriginalModules' = module_colors[modules[dend$order]+1],
'MergedModules' = merged_module_colors[merged_modules[dend$order]+1],
'TopModules' = top_module_colors[top_modules[dend$order]+1])
dend %>% as.dendrogram(hang=-1) %>% set('labels', rep('', nrow(datExpr))) %>% plot(ylim=c(min(dend$height),0.6))
colored_bars(colors=dend_colors[,-1])
rm(MEs, modules, module_colors, MEs_output, top_modules, merged_modules, tree_cut, top_module_colors, dend_colors, i)
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
cluster1 = sub(0, NA, clusterings[[i]]) %>% as.factor
for(j in (i):length(clusterings)){
cluster2 = sub(0, NA, clusterings[[j]]) %>% as.factor
cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
}
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = F, Colv = F, dendrogram = 'none', col=rev(brewer.pal(9,'YlOrRd'))[4:9],
cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
cexRow = 1, cexCol = 1, margins = c(12,12))
rm(i, j, cluster1, cluster2, cluster_sim)
Cluster don’t follow any strong patterns, at least in the first principal components
pca = datExpr %>% prcomp
plot_data = data.frame('ID' = rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
'PC3' = pca$x[,3], 'PC4' = pca$x[,4], 'PC5' = pca$x[,5],
'bicor' = sub(0,NA,clusterings[['bicor']]) %>% as.factor,
'bicor_top_clusters' = sub(0,NA,clusterings[['bicor_top_clusters']]) %>% as.factor,
'bicor_merged' = sub(0,NA,clusterings[['bicor_merged']]) %>% as.factor,
'cor' = sub(0,NA,clusterings[['cor']]) %>% as.factor,
'cor_top_clusters' = sub(0,NA,clusterings[['cor_top_clusters']]) %>% as.factor,
'cor_merged' = sub(0,NA,clusterings[['cor_merged']]) %>% as.factor,
'euc' = sub(0,NA,clusterings[['euc']]) %>% as.factor,
'euc_top_clusters' = sub(0,NA,clusterings[['euc_top_clusters']]) %>% as.factor,
'euc_merged' = sub(0,NA,clusterings[['euc_merged']]) %>% as.factor)
selectable_scatter_plot(plot_data[,-1,], plot_data[,-1])
rm(pca, plot_data)
Using Euclidean distance creates clusters defined by their mean expression, perhaps it’s not a good metric because it doesn’t seem to capture any more complex behaviours in the data
It is interesting that the Euclidean distance top clusters don’t group together on the 1st PC and instead form stripes along it
Correlation based distance metrics seem to capture a type of structure in the data not related to the first principal components
I think biweight midcorrelation may be the best choice because it’s more robust than correlation
clusterings_file = './../../FirstYearReview/Data/Gandal/clusterings.csv'
if(file.exists(clusterings_file)){
df = read.csv(clusterings_file, row.names=1)
if(!all(rownames(df) == rownames(datExpr))) stop('Gene ordering does not match the one in clusterings.csv!')
for(clustering in names(clusterings)){
df = df %>% mutate(!!clustering := sub(0, NA, clusterings[[clustering]]))
rownames(df) = rownames(datExpr)
}
} else {
df = clusterings %>% unlist %>% matrix(nrow=length(clusterings), byrow=T) %>% t %>% data.frame %>% na_if(0)
colnames(df) = names(clusterings)
rownames(df) = rownames(datExpr)
}
write.csv(df, file=clusterings_file)
rm(clusterings_file, df, clustering)
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] pdfCluster_1.0-3 doParallel_1.0.14
## [3] iterators_1.0.9 foreach_1.4.4
## [5] WGCNA_1.66 fastcluster_1.1.25
## [7] dynamicTreeCut_1.63-1 sva_3.30.1
## [9] genefilter_1.64.0 mgcv_1.8-26
## [11] nlme_3.1-137 DESeq2_1.22.2
## [13] SummarizedExperiment_1.12.0 DelayedArray_0.8.0
## [15] BiocParallel_1.16.6 matrixStats_0.54.0
## [17] Biobase_2.42.0 GenomicRanges_1.34.0
## [19] GenomeInfoDb_1.18.2 IRanges_2.16.0
## [21] S4Vectors_0.20.1 BiocGenerics_0.28.0
## [23] biomaRt_2.38.0 gplots_3.0.1
## [25] dendextend_1.10.0 gridExtra_2.3
## [27] viridis_0.5.1 viridisLite_0.3.0
## [29] RColorBrewer_1.1-2 plotlyutils_0.0.0.9000
## [31] plotly_4.8.0 glue_1.3.1
## [33] reshape2_1.4.3 forcats_0.3.0
## [35] stringr_1.4.0 dplyr_0.8.0.1
## [37] purrr_0.3.1 readr_1.3.1
## [39] tidyr_0.8.3 tibble_2.1.1
## [41] ggplot2_3.1.0 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] readxl_1.1.0 backports_1.1.2 Hmisc_4.1-1
## [4] plyr_1.8.4 lazyeval_0.2.2 splines_3.5.2
## [7] robust_0.4-18 digest_0.6.18 htmltools_0.3.6
## [10] GO.db_3.7.0 gdata_2.18.0 magrittr_1.5
## [13] checkmate_1.8.5 memoise_1.1.0 fit.models_0.5-14
## [16] cluster_2.0.7-1 limma_3.38.3 annotate_1.60.1
## [19] modelr_0.1.4 prettyunits_1.0.2 colorspace_1.4-1
## [22] rrcov_1.4-3 blob_1.1.1 rvest_0.3.2
## [25] haven_1.1.1 xfun_0.5 crayon_1.3.4
## [28] RCurl_1.95-4.10 jsonlite_1.6 impute_1.56.0
## [31] survival_2.43-3 gtable_0.2.0 zlibbioc_1.28.0
## [34] XVector_0.22.0 kernlab_0.9-27 prabclus_2.2-7
## [37] DEoptimR_1.0-8 abind_1.4-5 scales_1.0.0
## [40] mvtnorm_1.0-7 DBI_1.0.0 Rcpp_1.0.1
## [43] xtable_1.8-3 progress_1.2.0 htmlTable_1.11.2
## [46] magic_1.5-9 foreign_0.8-71 bit_1.1-14
## [49] mclust_5.4 preprocessCore_1.44.0 Formula_1.2-3
## [52] htmlwidgets_1.3 httr_1.4.0 fpc_2.1-11.1
## [55] acepack_1.4.1 modeltools_0.2-22 pkgconfig_2.0.2
## [58] XML_3.98-1.11 flexmix_2.3-15 nnet_7.3-12
## [61] locfit_1.5-9.1 tidyselect_0.2.5 rlang_0.3.2
## [64] AnnotationDbi_1.44.0 munsell_0.5.0 cellranger_1.1.0
## [67] tools_3.5.2 cli_1.1.0 generics_0.0.2
## [70] RSQLite_2.1.1 broom_0.5.1 geometry_0.4.0
## [73] evaluate_0.13 yaml_2.2.0 knitr_1.22
## [76] bit64_0.9-7 robustbase_0.93-0 caTools_1.17.1
## [79] whisker_0.3-2 xml2_1.2.0 compiler_3.5.2
## [82] rstudioapi_0.7 geneplotter_1.60.0 pcaPP_1.9-73
## [85] stringi_1.4.3 lattice_0.20-38 trimcluster_0.1-2.1
## [88] Matrix_1.2-15 pillar_1.3.1 data.table_1.12.0
## [91] bitops_1.0-6 R6_2.4.0 latticeExtra_0.6-28
## [94] KernSmooth_2.23-15 codetools_0.2-15 MASS_7.3-51.1
## [97] gtools_3.5.0 assertthat_0.2.1 withr_2.1.2
## [100] GenomeInfoDbData_1.2.0 diptest_0.75-7 hms_0.4.2
## [103] grid_3.5.2 rpart_4.1-13 class_7.3-14
## [106] rmarkdown_1.12 lubridate_1.7.4 base64enc_0.1-3